In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px #Data Visualisation


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/youtube-new/FRvideos.csv
/kaggle/input/youtube-new/JPvideos.csv
/kaggle/input/youtube-new/KRvideos.csv
/kaggle/input/youtube-new/CAvideos.csv
/kaggle/input/youtube-new/USvideos.csv
/kaggle/input/youtube-new/RU_category_id.json
/kaggle/input/youtube-new/DE_category_id.json
/kaggle/input/youtube-new/US_category_id.json
/kaggle/input/youtube-new/GBvideos.csv
/kaggle/input/youtube-new/KR_category_id.json
/kaggle/input/youtube-new/IN_category_id.json
/kaggle/input/youtube-new/JP_category_id.json
/kaggle/input/youtube-new/MXvideos.csv
/kaggle/input/youtube-new/CA_category_id.json
/kaggle/input/youtube-new/GB_category_id.json
/kaggle/input/youtube-new/MX_category_id.json
/kaggle/input/youtube-new/INvideos.csv
/kaggle/input/youtube-new/RUvideos.csv
/kaggle/input/youtube-new/DEvideos.csv
/kaggle/input/youtube-new/FR_category_id.json
In [2]:
#Explorational data analysis of Indian Trending videos
 #Imported dataset
df=pd.read_csv("/kaggle/input/youtube-new/INvideos.csv")
df
Out[2]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description
0 kzwfHumJyYc 17.14.11 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12T12:20:39.000Z sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 882 https://i.ytimg.com/vi/kzwfHumJyYc/default.jpg False False False Presenting Sharry Mann latest Punjabi Song Cu...
1 zUZ1z7FwLc8 17.14.11 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13T05:43:56.000Z पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg True False False पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...
2 10L1hZ9qa58 17.14.11 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12T15:48:08.000Z Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 149 https://i.ytimg.com/vi/10L1hZ9qa58/default.jpg False False False Watch Stylish Star Allu Arjun @ ChaySam Weddin...
3 N1vE8iiEg64 17.14.11 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12T07:08:48.000Z Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 2684 https://i.ytimg.com/vi/N1vE8iiEg64/default.jpg False False False This video showcases the difference between pe...
4 kJzGH0PVQHQ 17.14.11 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13T01:14:16.000Z Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 66 https://i.ytimg.com/vi/kJzGH0PVQHQ/default.jpg False False False why Samantha became EMOTIONAL @ Samantha naga ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 18.14.06 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13T08:01:11.000Z twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 196 https://i.ytimg.com/vi/iNHecA3PJCo/default.jpg False False False PRG Music & RDC Rajasthani presents फेकू आशिक़...
37348 dpPmPbhcslM 18.14.06 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13T11:30:04.000Z flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 1428 https://i.ytimg.com/vi/dpPmPbhcslM/default.jpg False False False Flowers - A R Rahman Show,Book your Tickets He...
37349 mV6aztP58f8 18.14.06 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13T05:00:02.000Z mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 697 https://i.ytimg.com/vi/mV6aztP58f8/default.jpg False False False Subscribe to Mazhavil Manorama now for your da...
37350 qxqDNP1bDEw 18.14.06 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13T15:07:49.000Z tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 65 https://i.ytimg.com/vi/qxqDNP1bDEw/default.jpg False False False Nuabohu : Story of a rustic village girl who w...
37351 wERgpPK44w0 18.14.06 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10T04:29:54.000Z Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 1205 https://i.ytimg.com/vi/wERgpPK44w0/default.jpg False False False Check out Ee Nagaraniki Emaindi Trailer #EeNag...

37352 rows × 16 columns

In [3]:
df.tail()
Out[3]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description
37347 iNHecA3PJCo 18.14.06 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13T08:01:11.000Z twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 196 https://i.ytimg.com/vi/iNHecA3PJCo/default.jpg False False False PRG Music & RDC Rajasthani presents फेकू आशिक़...
37348 dpPmPbhcslM 18.14.06 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13T11:30:04.000Z flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 1428 https://i.ytimg.com/vi/dpPmPbhcslM/default.jpg False False False Flowers - A R Rahman Show,Book your Tickets He...
37349 mV6aztP58f8 18.14.06 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13T05:00:02.000Z mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 697 https://i.ytimg.com/vi/mV6aztP58f8/default.jpg False False False Subscribe to Mazhavil Manorama now for your da...
37350 qxqDNP1bDEw 18.14.06 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13T15:07:49.000Z tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 65 https://i.ytimg.com/vi/qxqDNP1bDEw/default.jpg False False False Nuabohu : Story of a rustic village girl who w...
37351 wERgpPK44w0 18.14.06 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10T04:29:54.000Z Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 1205 https://i.ytimg.com/vi/wERgpPK44w0/default.jpg False False False Check out Ee Nagaraniki Emaindi Trailer #EeNag...
In [4]:
df.shape[0] #No of records present in the csv file
Out[4]:
37352
In [5]:
df['video_id'].unique().shape[0] #No. of unique videos present in the trending list
Out[5]:
16307

It can be seen that the number of records do not match the number of unique videos present in the trending list. This is natural as the video can be on the trending list for any number of months or days.

In [6]:
df.describe()
Out[6]:
category_id views likes dislikes comment_count
count 37352.000000 3.735200e+04 3.735200e+04 3.735200e+04 37352.00000
mean 21.576596 1.060478e+06 2.708272e+04 1.665082e+03 2676.99743
std 6.556593 3.184932e+06 9.714510e+04 1.607617e+04 14868.31713
min 1.000000 4.024000e+03 0.000000e+00 0.000000e+00 0.00000
25% 23.000000 1.239155e+05 8.640000e+02 1.080000e+02 81.00000
50% 24.000000 3.045860e+05 3.069000e+03 3.260000e+02 329.00000
75% 24.000000 7.992912e+05 1.377425e+04 1.019250e+03 1285.00000
max 43.000000 1.254322e+08 2.912710e+06 1.545017e+06 827755.00000
In [7]:
#Null values
df.isnull().any()
Out[7]:
video_id                  False
trending_date             False
title                     False
channel_title             False
category_id               False
publish_time              False
tags                      False
views                     False
likes                     False
dislikes                  False
comment_count             False
thumbnail_link            False
comments_disabled         False
ratings_disabled          False
video_error_or_removed    False
description                True
dtype: bool
In [8]:
new_df=df[["category_id","views","likes","dislikes","comment_count","comments_disabled","ratings_disabled","video_error_or_removed"]]
new_df #Abstracting some data
Out[8]:
category_id views likes dislikes comment_count comments_disabled ratings_disabled video_error_or_removed
0 1 1096327 33966 798 882 False False False
1 25 590101 735 904 0 True False False
2 24 473988 2011 243 149 False False False
3 23 1242680 70353 1624 2684 False False False
4 24 464015 492 293 66 False False False
... ... ... ... ... ... ... ... ...
37347 23 214378 3291 404 196 False False False
37348 24 406828 1726 478 1428 False False False
37349 24 386319 1216 453 697 False False False
37350 24 130263 698 115 65 False False False
37351 24 1278249 22466 1609 1205 False False False

37352 rows × 8 columns

In [9]:
#Encoding the data
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()
In [10]:
#Label Encoder can only take one dimensional column
new_df.iloc[:,5]=lb.fit_transform(new_df.iloc[:,5]) #comments_disabled
new_df.iloc[:,6]=lb.fit_transform(new_df.iloc[:,6]) #ratings_disabled
new_df.iloc[:,7]=lb.fit_transform(new_df.iloc[:,7]) #video_error_or_removed
new_df
/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py:1745: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
Out[10]:
category_id views likes dislikes comment_count comments_disabled ratings_disabled video_error_or_removed
0 1 1096327 33966 798 882 0 0 0
1 25 590101 735 904 0 1 0 0
2 24 473988 2011 243 149 0 0 0
3 23 1242680 70353 1624 2684 0 0 0
4 24 464015 492 293 66 0 0 0
... ... ... ... ... ... ... ... ...
37347 23 214378 3291 404 196 0 0 0
37348 24 406828 1726 478 1428 0 0 0
37349 24 386319 1216 453 697 0 0 0
37350 24 130263 698 115 65 0 0 0
37351 24 1278249 22466 1609 1205 0 0 0

37352 rows × 8 columns

In [11]:
new_df.describe()
Out[11]:
category_id views likes dislikes comment_count comments_disabled ratings_disabled video_error_or_removed
count 37352.000000 3.735200e+04 3.735200e+04 3.735200e+04 37352.00000 37352.000000 37352.000000 37352.000000
mean 21.576596 1.060478e+06 2.708272e+04 1.665082e+03 2676.99743 0.032234 0.020909 0.000294
std 6.556593 3.184932e+06 9.714510e+04 1.607617e+04 14868.31713 0.176623 0.143082 0.017159
min 1.000000 4.024000e+03 0.000000e+00 0.000000e+00 0.00000 0.000000 0.000000 0.000000
25% 23.000000 1.239155e+05 8.640000e+02 1.080000e+02 81.00000 0.000000 0.000000 0.000000
50% 24.000000 3.045860e+05 3.069000e+03 3.260000e+02 329.00000 0.000000 0.000000 0.000000
75% 24.000000 7.992912e+05 1.377425e+04 1.019250e+03 1285.00000 0.000000 0.000000 0.000000
max 43.000000 1.254322e+08 2.912710e+06 1.545017e+06 827755.00000 1.000000 1.000000 1.000000
In [12]:
#See the correlation
import seaborn as sns
df.corr()
Out[12]:
category_id views likes dislikes comment_count comments_disabled ratings_disabled video_error_or_removed
category_id 1.000000 -0.183215 -0.127600 -0.042450 -0.036223 0.038928 0.057953 -0.053865
views -0.183215 1.000000 0.850096 0.543294 0.674195 -0.034227 -0.034364 0.004976
likes -0.127600 0.850096 1.000000 0.494560 0.780514 -0.045624 -0.040741 0.028464
dislikes -0.042450 0.543294 0.494560 1.000000 0.708125 -0.011821 -0.015136 0.002985
comment_count -0.036223 0.674195 0.780514 0.708125 1.000000 -0.032860 -0.024210 0.015771
comments_disabled 0.038928 -0.034227 -0.045624 -0.011821 -0.032860 1.000000 0.354718 -0.003132
ratings_disabled 0.057953 -0.034364 -0.040741 -0.015136 -0.024210 0.354718 1.000000 -0.002508
video_error_or_removed -0.053865 0.004976 0.028464 0.002985 0.015771 -0.003132 -0.002508 1.000000
In [13]:
df=df[df["video_id"]!="#NAME?"] #removing data that has error video_id
df
Out[13]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description
0 kzwfHumJyYc 17.14.11 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12T12:20:39.000Z sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 882 https://i.ytimg.com/vi/kzwfHumJyYc/default.jpg False False False Presenting Sharry Mann latest Punjabi Song Cu...
1 zUZ1z7FwLc8 17.14.11 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13T05:43:56.000Z पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg True False False पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं...
2 10L1hZ9qa58 17.14.11 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12T15:48:08.000Z Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 149 https://i.ytimg.com/vi/10L1hZ9qa58/default.jpg False False False Watch Stylish Star Allu Arjun @ ChaySam Weddin...
3 N1vE8iiEg64 17.14.11 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12T07:08:48.000Z Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 2684 https://i.ytimg.com/vi/N1vE8iiEg64/default.jpg False False False This video showcases the difference between pe...
4 kJzGH0PVQHQ 17.14.11 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13T01:14:16.000Z Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 66 https://i.ytimg.com/vi/kJzGH0PVQHQ/default.jpg False False False why Samantha became EMOTIONAL @ Samantha naga ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 18.14.06 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13T08:01:11.000Z twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 196 https://i.ytimg.com/vi/iNHecA3PJCo/default.jpg False False False PRG Music & RDC Rajasthani presents फेकू आशिक़...
37348 dpPmPbhcslM 18.14.06 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13T11:30:04.000Z flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 1428 https://i.ytimg.com/vi/dpPmPbhcslM/default.jpg False False False Flowers - A R Rahman Show,Book your Tickets He...
37349 mV6aztP58f8 18.14.06 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13T05:00:02.000Z mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 697 https://i.ytimg.com/vi/mV6aztP58f8/default.jpg False False False Subscribe to Mazhavil Manorama now for your da...
37350 qxqDNP1bDEw 18.14.06 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13T15:07:49.000Z tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 65 https://i.ytimg.com/vi/qxqDNP1bDEw/default.jpg False False False Nuabohu : Story of a rustic village girl who w...
37351 wERgpPK44w0 18.14.06 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10T04:29:54.000Z Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 1205 https://i.ytimg.com/vi/wERgpPK44w0/default.jpg False False False Check out Ee Nagaraniki Emaindi Trailer #EeNag...

36841 rows × 16 columns

In [14]:
sns.heatmap(df.corr(),annot=True) #Higher the amount, the more the correlation
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e4d98ac10>

Correlation Analysis

  1. We can see that the columns views,likes,dislikes,comment_count are highly correlated with each other. Especially, there seems to be a high correlation between views and likes.
  2. There's also a small correlation between comments_disabled and ratings_disabled column.
  3. But it becomes apparent that there is no correlation with the category_id and video_error_or_removed columns. So they can be removed from our analysis methods.

According to Youtube Help for what makes a video trending:

Trending aims to balance all of these considerations. To achieve this, Trending considers many signals, including (but not limited to):

  • View count
  • How quickly the video is generating views (i.e. “temperature”)
  • Where views are coming from, including outside of YouTube
  • The age of the video
  • How the video performs compared to other recent uploads from the same channel

With the current dataset we cannot find where the views are coming from or how the other videos from the same channel are performing because it is not specified. We can only focus on the other three considerations and see the correlation.

Exploratory Data Analysis

1. Based on Categories

Extracting the types of categories

In [15]:
import json
with open('/kaggle/input/youtube-new/IN_category_id.json') as f: #Indian trending videos
  data = json.load(f)
print(data)
{'kind': 'youtube#videoCategoryListResponse', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/1v2mrzYSYG6onNLt2qTj13hkQZk"', 'items': [{'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/Xy1mB4_yLrHy_BmKmPBggty2mZQ"', 'id': '1', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Film & Animation', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/UZ1oLIIz2dxIhO45ZTFR3a3NyTA"', 'id': '2', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Autos & Vehicles', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/nqRIq97-xe5XRZTxbknKFVe5Lmg"', 'id': '10', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Music', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/HwXKamM1Q20q9BN-oBJavSGkfDI"', 'id': '15', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Pets & Animals', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/9GQMSRjrZdHeb1OEM1XVQ9zbGec"', 'id': '17', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Sports', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/FJwVpGCVZ1yiJrqZbpqe68Sy_OE"', 'id': '18', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Short Movies', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/M-3iD9dwK7YJCafRf_DkLN8CouA"', 'id': '19', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Travel & Events', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/WmA0qYEfjWsAoyJFSw2zinhn2wM"', 'id': '20', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Gaming', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/EapFaGYG7K0StIXVf8aba249tdM"', 'id': '21', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Videoblogging', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/xId8RX7vRN8rqkbYZbNIytUQDRo"', 'id': '22', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'People & Blogs', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/G9LHzQmx44rX2S5yaga_Aqtwz8M"', 'id': '23', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Comedy', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/UVB9oxX2Bvqa_w_y3vXSLVK5E_s"', 'id': '24', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Entertainment', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/QiLK0ZIrFoORdk_g2l_XR_ECjDc"', 'id': '25', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'News & Politics', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/r6Ck6Z0_L0rG37VJQR200SGNA_w"', 'id': '26', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Howto & Style', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/EoYkczo9I3RCf96RveKTOgOPkUM"', 'id': '27', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Education', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/w5HjcTD82G_XA3xBctS30zS-JpQ"', 'id': '28', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Science & Technology', 'assignable': True}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/lL7uWDr_071CHxifjYG1tJrp4Uo"', 'id': '30', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Movies', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/WnuVfjO-PyFLO7NTRQIbrGE62nk"', 'id': '31', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Anime/Animation', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/ctpH2hGA_UZ3volJT_FTlOg9M00"', 'id': '32', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Action/Adventure', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/L0kR3-g1BAo5UD1PLVbQ7LkkDtQ"', 'id': '33', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Classics', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/pUZOAC_s9sfiwar639qr_wAB-aI"', 'id': '34', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Comedy', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/Xb5JLhtyNRN3AQq021Ds-OV50Jk"', 'id': '35', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Documentary', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/u8WXzF4HIhtEi805__sqjuA4lEk"', 'id': '36', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Drama', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/D04PP4Gr7wc4IV_O9G66Z4A8KWQ"', 'id': '37', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Family', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/i5-_AceGXQCEEMWU0V8CcQm_vLQ"', 'id': '38', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Foreign', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/rtlxd0zOixA9QHdIZB26-St5qgQ"', 'id': '39', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Horror', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/N1TrDFLRppxZgBowCJfJCvh0Dpg"', 'id': '40', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Sci-Fi/Fantasy', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/7UMGi6zRySqXopr_rv4sZq6Za2E"', 'id': '41', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Thriller', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/RScXhi324h8usyIetreAVb-uKeM"', 'id': '42', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Shorts', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/0n9MJVCDLpA8q7aiGVrFsuFsd0A"', 'id': '43', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Shows', 'assignable': False}}, {'kind': 'youtube#videoCategory', 'etag': '"XI7nbFXulYBIpL0ayR_gDh3eu1k/x5NxSf5fz8hn4loSN4rvhwzD_pY"', 'id': '44', 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ', 'title': 'Trailers', 'assignable': False}}]}
In [16]:
categories_dict={}
print("Different types of categories: "+str(len(data["items"])))
for i in range(len(data["items"])):
    cat_id=data["items"][i]["id"]
    title=data["items"][i]["snippet"]["title"]
    categories_dict[int(cat_id)]=title
print(categories_dict)
    
Different types of categories: 31
{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure', 33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama', 37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy', 41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'}

Mapping the category id to the category names

In [17]:
df['category'] = df['category_id'].map(categories_dict) 
df.head(5)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
Out[17]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category
0 kzwfHumJyYc 17.14.11 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12T12:20:39.000Z sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 882 https://i.ytimg.com/vi/kzwfHumJyYc/default.jpg False False False Presenting Sharry Mann latest Punjabi Song Cu... Film & Animation
1 zUZ1z7FwLc8 17.14.11 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13T05:43:56.000Z पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg True False False पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... News & Politics
2 10L1hZ9qa58 17.14.11 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12T15:48:08.000Z Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 149 https://i.ytimg.com/vi/10L1hZ9qa58/default.jpg False False False Watch Stylish Star Allu Arjun @ ChaySam Weddin... Entertainment
3 N1vE8iiEg64 17.14.11 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12T07:08:48.000Z Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 2684 https://i.ytimg.com/vi/N1vE8iiEg64/default.jpg False False False This video showcases the difference between pe... Comedy
4 kJzGH0PVQHQ 17.14.11 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13T01:14:16.000Z Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 66 https://i.ytimg.com/vi/kJzGH0PVQHQ/default.jpg False False False why Samantha became EMOTIONAL @ Samantha naga ... Entertainment
In [18]:
arr=df["category"].unique()
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(df.loc[df["category"] == arr[i]].shape[0]) #get all rows that have that category
fig = px.pie(df, values=sum_of_videos, names=arr,title="Categories of Indian trending videos")
fig.show()

Comparing it with different countries

In [19]:
import json
with open('/kaggle/input/youtube-new/US_category_id.json') as f: #US trending videos
  data_us = json.load(f)

with open('/kaggle/input/youtube-new/DE_category_id.json') as f: #Indian trending videos
  data_ger = json.load(f)
In [20]:
categories_dict_us={}
print("Different types of categories in US: "+str(len(data["items"])))
for i in range(len(data_us["items"])):
    cat_id=data_us["items"][i]["id"]
    title=data_us["items"][i]["snippet"]["title"]
    categories_dict_us[int(cat_id)]=title
print(categories_dict_us)
categories_dict_ger={}
print("Different types of categories in Ger: "+str(len(data["items"])))
for i in range(len(data_ger["items"])):
    cat_id=data_ger["items"][i]["id"]
    title=data_ger["items"][i]["snippet"]["title"]
    categories_dict_ger[int(cat_id)]=title
print(categories_dict_ger)
Different types of categories in US: 31
{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 29: 'Nonprofits & Activism', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure', 33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama', 37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy', 41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'}
Different types of categories in Ger: 31
{1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music', 15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies', 19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging', 22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment', 25: 'News & Politics', 26: 'Howto & Style', 27: 'Education', 28: 'Science & Technology', 30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure', 33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama', 37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy', 41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'}
In [21]:
df_us=pd.read_csv("/kaggle/input/youtube-new/USvideos.csv") #US VIDEOS data
df_ger=pd.read_csv("/kaggle/input/youtube-new/DEvideos.csv") #German videos data
In [22]:
df_us['category'] = df_us['category_id'].map(categories_dict_us) 
df_ger['category'] = df_ger['category_id'].map(categories_dict_ger) 
In [23]:
arr=df_us["category"].unique()
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(df_us.loc[df_us["category"] == arr[i]].shape[0]) #get all rows that have that category
fig = px.pie(df_us, values=sum_of_videos, names=arr,title="Categories of US trending videos")
fig.show()
In [24]:
arr=df_ger["category"].unique()
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(df_ger.loc[df_ger["category"] == arr[i]].shape[0]) #get all rows that have that category
fig = px.pie(df_ger, values=sum_of_videos, names=arr,title="Categories of Germany trending videos")
fig.show()

Results

Most of the Indian trending videos belong to the category of "Entertainment". Also both the German and US trending videos had similar statistics with highest number of videos coming from the "Entertainment" category. Although percentage of the "Entertainment" category in India is more than the other two countries.

2. Videos on Trending list

Trending videos repeating in the list

In [25]:
df.groupby('video_id').size().sort_values(ascending=False).reset_index(name="count").iloc[1:].head(10) 
#Indicates that there are videos which come on trending list more than once
Out[25]:
video_id count
1 l7E0kTvARsA 14
2 1J76wN0TPI4 14
3 y-PQiShdTKA 13
4 C1Pn5Ln1R4M 13
5 yFFL1we4j_Y 13
6 WDiK14qI3pQ 13
7 bYSRPuDEnTg 13
8 aNwWdF8qq-M 12
9 ZnVIUr_BQSs 12
10 mMCEvr3VWqQ 12
In [26]:
df.loc[df["video_id"]=="rRr1qiJRsXk"]
Out[26]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category
28059 rRr1qiJRsXk 18.25.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 18639195 511763 15606 32435 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28204 rRr1qiJRsXk 18.25.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 18639195 511768 15606 32435 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28254 rRr1qiJRsXk 18.26.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 28572753 646403 21140 40610 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28381 rRr1qiJRsXk 18.26.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 28572753 646404 21140 40610 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28431 rRr1qiJRsXk 18.27.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 32679759 689918 23886 42689 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28570 rRr1qiJRsXk 18.27.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 32679759 689920 23885 42689 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28620 rRr1qiJRsXk 18.28.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 36202030 719461 26170 44326 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28766 rRr1qiJRsXk 18.28.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 36202030 719461 26170 44335 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28817 rRr1qiJRsXk 18.29.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 39207474 742221 28091 45614 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
28956 rRr1qiJRsXk 18.29.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 39207474 742221 28091 45614 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29011 rRr1qiJRsXk 18.30.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 41978366 761124 29512 46747 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29145 rRr1qiJRsXk 18.30.04 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 41978366 761124 29512 46747 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29209 rRr1qiJRsXk 18.01.05 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 42738154 768406 29892 47104 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29339 rRr1qiJRsXk 18.01.05 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 42738154 768406 29892 47104 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29431 rRr1qiJRsXk 18.02.05 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 43553343 775542 30350 47514 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment
29668 rRr1qiJRsXk 18.03.05 Sanju | Official Teaser | Ranbir Kapoor | Rajk... FoxStarHindi 24 2018-04-24T07:58:08.000Z Sanju Teaser|"Official Teaser"|"Sanju Official... 44171797 780895 30791 47829 https://i.ytimg.com/vi/rRr1qiJRsXk/default.jpg False False False Few lives in our times are as dramatic and eni... Entertainment

How many times does a video trend

In [27]:
videos_appeared_most = df.groupby('video_id').size().sort_values(ascending=False)
indexes = videos_appeared_most.index.values
tdf = df[df['video_id'].isin(indexes)].sort_values(
    by='trending_date', ascending=True).drop_duplicates(subset=['video_id'])
tdf['trending_days'] = tdf['video_id'].map(videos_appeared_most)
tdf.sort_values(by='trending_days', ascending=False, inplace=True)
tdf = tdf[['video_id', 'title', 'trending_days', 'views', 'likes', 
           'dislikes', 'comment_count', 'category','channel_title']]
tdf.head(11)
Out[27]:
video_id title trending_days views likes dislikes comment_count category channel_title
29339 rRr1qiJRsXk Sanju | Official Teaser | Ranbir Kapoor | Rajk... 16 42738154 768406 29892 47104 Entertainment FoxStarHindi
34969 1J76wN0TPI4 Sanju | Official Trailer | Ranbir Kapoor | Raj... 14 23758250 587326 18799 43728 Entertainment FoxStarHindi
34973 l7E0kTvARsA Golak Bugni Bank Te Batua Full Movie (HD) | Ha... 14 781977 22023 783 1096 Movies Rhythm Boyz
19345 yFFL1we4j_Y Mill Lo Na - Guri Ft. Sukhe (Full Song) Jaani ... 13 6101350 186861 11451 14413 Music Geet MP3
35832 C1Pn5Ln1R4M कस्टमर केयर वाली छोरिया | राजस्थानी सुपरहिट कॉ... 13 75248 1572 170 101 Comedy RDC Rajasthani
34970 WDiK14qI3pQ Restaurant Sutiyapa | Ashish Chanchlani 13 5163751 360860 17071 28990 Comedy ashish chanchlani vines
26132 bYSRPuDEnTg Garmi Ke Side-Effects | Ashish Chanchlani 13 4227847 387792 9279 24003 Comedy ashish chanchlani vines
32322 y-PQiShdTKA Tochan (Full Video) | SIDHU MOOSEWALA | BYG BY... 13 5659320 112530 3180 10480 Music Humble Music
35572 5hpkINjCf7A Result Ka Mahool - Amit Bhadana 12 3385971 326085 9541 38656 Entertainment Amit Bhadana
29301 Tx0fhKJRaeU Bhai Bhai Ka Pyaar - Part 2 | Harsh Beniwal 12 4375355 328356 6380 19368 Comedy Harsh Beniwal
33029 XgVrYVBs2fc Desi Vs Others ( Relationship Ki Kahani ) - Am... 12 3519671 302345 9251 34740 Entertainment Amit Bhadana
In [28]:
df['trending_days'] = df['video_id'].map(videos_appeared_most)
df
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[28]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category trending_days
0 kzwfHumJyYc 17.14.11 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12T12:20:39.000Z sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 882 https://i.ytimg.com/vi/kzwfHumJyYc/default.jpg False False False Presenting Sharry Mann latest Punjabi Song Cu... Film & Animation 3
1 zUZ1z7FwLc8 17.14.11 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13T05:43:56.000Z पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg True False False पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... News & Politics 6
2 10L1hZ9qa58 17.14.11 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12T15:48:08.000Z Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 149 https://i.ytimg.com/vi/10L1hZ9qa58/default.jpg False False False Watch Stylish Star Allu Arjun @ ChaySam Weddin... Entertainment 4
3 N1vE8iiEg64 17.14.11 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12T07:08:48.000Z Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 2684 https://i.ytimg.com/vi/N1vE8iiEg64/default.jpg False False False This video showcases the difference between pe... Comedy 4
4 kJzGH0PVQHQ 17.14.11 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13T01:14:16.000Z Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 66 https://i.ytimg.com/vi/kJzGH0PVQHQ/default.jpg False False False why Samantha became EMOTIONAL @ Samantha naga ... Entertainment 5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 18.14.06 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13T08:01:11.000Z twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 196 https://i.ytimg.com/vi/iNHecA3PJCo/default.jpg False False False PRG Music & RDC Rajasthani presents फेकू आशिक़... Comedy 2
37348 dpPmPbhcslM 18.14.06 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13T11:30:04.000Z flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 1428 https://i.ytimg.com/vi/dpPmPbhcslM/default.jpg False False False Flowers - A R Rahman Show,Book your Tickets He... Entertainment 2
37349 mV6aztP58f8 18.14.06 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13T05:00:02.000Z mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 697 https://i.ytimg.com/vi/mV6aztP58f8/default.jpg False False False Subscribe to Mazhavil Manorama now for your da... Entertainment 2
37350 qxqDNP1bDEw 18.14.06 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13T15:07:49.000Z tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 65 https://i.ytimg.com/vi/qxqDNP1bDEw/default.jpg False False False Nuabohu : Story of a rustic village girl who w... Entertainment 2
37351 wERgpPK44w0 18.14.06 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10T04:29:54.000Z Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 1205 https://i.ytimg.com/vi/wERgpPK44w0/default.jpg False False False Check out Ee Nagaraniki Emaindi Trailer #EeNag... Entertainment 10

36841 rows × 18 columns

In [29]:
arr=tdf["category"].unique()
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(tdf.loc[tdf["category"] == arr[i]].shape[0]) #get all rows that have that category
fig = px.pie(tdf, values=sum_of_videos, names=arr,title="Categories of trending videos which appear more number of times on the trending list")
fig.show()
In [30]:
sns.heatmap(tdf.corr(),annot=True)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e4c39dcd0>

No. of times the video comes on trending list has some correlation. It has about 0.5 correlation which is pretty good. Hence it will be included in our model.

Videos that have more dislikes than likes

In [31]:
df['temp1'] = np.where((df['dislikes'] > df['likes']) ,True, False)
df.loc[df["temp1"]==True] #total of 645 rows with more dislikes than likes
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[31]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category trending_days temp1
1 zUZ1z7FwLc8 17.14.11 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13T05:43:56.000Z पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 https://i.ytimg.com/vi/zUZ1z7FwLc8/default.jpg True False False पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... News & Politics 6 True
24 znOC3IU0dF8 17.14.11 Hero Tarun at #ChaySamWeddingReception | Saman... News Mantra 24 2017-11-13T06:03:49.000Z Hero Tarun At Samantha Naga Chaitanya Wedding ... 55178 45 94 15 https://i.ytimg.com/vi/znOC3IU0dF8/default.jpg False False False NaN Entertainment 1 True
96 qgasli1u-Ag 17.14.11 AUNTY KI GHANTI PART 2 | FULL MUSIC VIDEO Mallika 23 2017-11-11T08:34:08.000Z lyricist|"aunty ki gunty rap"|"aunty ki ganti ... 108414 3771 5349 2491 https://i.ytimg.com/vi/qgasli1u-Ag/default.jpg False False False Hey Internet!\nThank you for watching this wee... Comedy 1 True
113 PBuCqgA7u5w 17.14.11 Abhishek Bachchan Lashes Out As Paparazzi Clic... Business Of Cinema 24 2017-11-08T11:34:35.000Z businessofcinema|"boc"|"bollywood"|"abhishek b... 2664065 2351 4169 883 https://i.ytimg.com/vi/PBuCqgA7u5w/default.jpg False False False While attending a party Aishwarya Rai Bachchan... Entertainment 1 True
133 XVPpvn9vQDU 17.14.11 Ammuvinte Amma l The person who seeking for Am... Mazhavil Manorama 24 2017-11-12T05:30:00.000Z Mazhavil|"Manorama"|"Ammuvinte Amma"|"Serial"|... 96306 52 75 41 https://i.ytimg.com/vi/XVPpvn9vQDU/default.jpg False False False Watch Ammuvinte Amma Monday to Saturday at 7.... Entertainment 1 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37036 pS5Ng5r8cds 18.13.06 Nani Initiates Legal Proceedings Against Sri R... ETV Telangana 25 2018-06-11T17:18:54.000Z ETV|"ETVTelugu"|"ETV NewsVideo"|"National News... 133607 350 381 297 https://i.ytimg.com/vi/pS5Ng5r8cds/default.jpg False False False Nani Initiates Legal Proceedings Against Sri R... News & Politics 4 True
37155 smcUETnz2xY 18.13.06 Sri Reddy Reacts On Nani's Legal Notices | Fil... Film Jalsa 24 2018-06-11T14:44:03.000Z Film Jalsa|"Sri Reddy Reacts On Nani's Legal N... 180791 610 2783 675 https://i.ytimg.com/vi/smcUETnz2xY/default.jpg False False False Sri Reddy Reacts On Nani's Legal Notices | Fil... Entertainment 5 True
37156 pS5Ng5r8cds 18.13.06 Nani Initiates Legal Proceedings Against Sri R... ETV Telangana 25 2018-06-11T17:18:54.000Z ETV|"ETVTelugu"|"ETV NewsVideo"|"National News... 133607 350 381 297 https://i.ytimg.com/vi/pS5Ng5r8cds/default.jpg False False False Nani Initiates Legal Proceedings Against Sri R... News & Politics 4 True
37245 smcUETnz2xY 18.14.06 Sri Reddy Reacts On Nani's Legal Notices | Fil... Film Jalsa 24 2018-06-11T14:44:03.000Z Film Jalsa|"Sri Reddy Reacts On Nani's Legal N... 313505 917 3977 945 https://i.ytimg.com/vi/smcUETnz2xY/default.jpg False False False Sri Reddy Reacts On Nani's Legal Notices | Fil... Entertainment 5 True
37257 pS5Ng5r8cds 18.14.06 Nani Initiates Legal Proceedings Against Sri R... ETV Telangana 25 2018-06-11T17:18:54.000Z ETV|"ETVTelugu"|"ETV NewsVideo"|"National News... 208154 477 629 448 https://i.ytimg.com/vi/pS5Ng5r8cds/default.jpg False False False Nani Initiates Legal Proceedings Against Sri R... News & Politics 4 True

636 rows × 19 columns

Channel with highest number of videos on trending list

In [32]:
total_channels=df.groupby("channel_title").size().reset_index(name="video_count").sort_values("video_count", ascending=False).head(20)
total_channels #top 20 channels and their number of videos
Out[32]:
channel_title video_count
1375 etvteluguindia 280
1272 VikatanTV 280
352 Flowers Comedy 270
297 ETV Plus India 251
857 RadaanMedia 240
917 SET India 237
912 SAB TV 237
295 ETV Jabardasth 234
1391 mallemalatv 228
1121 Tarang TV 223
1270 Vijay Television 223
1049 T-Series 215
635 Mazhavil Manorama 214
1127 Technical Guruji 214
1033 Study IQ education 211
1245 V6 News Telugu 210
1413 zeetv 194
1038 SunTV Tamil 194
1016 Speed Records 193
1414 zeetvtelugu 193
In [33]:
fig = px.bar(total_channels, x="video_count", y="channel_title",color='video_count',title="Top 20 channels", labels={"video_count":"Number of videos on the trending list","channel_title":"Channel"})
fig.show()
In [34]:
temp1=df.loc[df["channel_title"]=="VikatanTV"].groupby("category").size().reset_index(name="cat_count")
temp1.plot.pie(y="cat_count",subplots=True, figsize=(11, 6),labels=temp1["category"]) #Channel with most trending videos having categories
Out[34]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f3e4c209550>],
      dtype=object)

Comparing with other countries

In [35]:
total_channels_us=df_us.groupby("channel_title").size().reset_index(name="video_count").sort_values("video_count", ascending=False).head(20)
fig = px.bar(total_channels_us, x="video_count", y="channel_title",color='video_count',title="Top 20 channels", labels={"video_count":"Number of videos of US on the trending list","channel_title":"Channel"})
fig.show()
In [36]:
total_channels_ger=df_ger.groupby("channel_title").size().reset_index(name="video_count").sort_values("video_count", ascending=False).head(20)
fig = px.bar(total_channels_ger, x="video_count", y="channel_title",color='video_count',title="Top 20 channels", labels={"video_count":"Number of videos of Germany on the trending list","channel_title":"Channel"})
fig.show()
In [37]:
temp1=df_us.loc[df_us["channel_title"]=="ESPN"].groupby("category").size().reset_index(name="cat_count")
temp1.plot.pie(y="cat_count",subplots=True, figsize=(11, 6),labels=temp1["category"]) #Channel with most trending videos in US
temp2=df_ger.loc[df_ger["channel_title"]=="Galileo"].groupby("category").size().reset_index(name="cat_count")
temp2.plot.pie(y="cat_count",subplots=True, figsize=(11, 6),labels=temp2["category"]) #Channel with most trending videos of germany
Out[37]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f3e4c149350>],
      dtype=object)

Results

To our surprise, the channel which is in the trending list with most number of videos in India is mostly belonging to the "Shows" category. Similarly, while analyzing the data of "ESPN" channel, which is the channel in US with most number of videos in the trending list, the category it published is of "Sports" category despite it's low percentage in overall categories. Even the channel in Germany has few videos from "Education" category eventhough it has less percentage.

It can indicate the fact that the reason for those videos being on the trending list is because of their "specificity", "rarity" or "relatable nature of the content to only a specific subgroup.

Although we got some surprising results in this one occasion, categories does not have that big correlation with other data. Hence, it may be relieved from the analysis.

3. Dates

Converting format of the trending dates

In [38]:
df["trending_date"]=pd.to_datetime(df["trending_date"],format="%y.%d.%m")
df["publish_time"]=pd.to_datetime(df["publish_time"])
df=df.assign(trending_day=df.trending_date.dt.day,trending_month=df.trending_date.dt.month,trending_year=df.trending_date.dt.year)
df=df.assign(publish_day=df.publish_time.dt.day,publish_month=df.publish_time.dt.month,publish_year=df.publish_time.dt.year)
df["publishing_day"] = df["publish_time"].dt.day_name()
df["publishing_hour"] = df["publish_time"].dt.hour
df
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[38]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_days temp1 trending_day trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour
0 kzwfHumJyYc 2017-11-14 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12 12:20:39+00:00 sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 ... 3 False 14 11 2017 12 11 2017 Sunday 12
1 zUZ1z7FwLc8 2017-11-14 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13 05:43:56+00:00 पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 ... 6 True 14 11 2017 13 11 2017 Monday 5
2 10L1hZ9qa58 2017-11-14 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12 15:48:08+00:00 Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 ... 4 False 14 11 2017 12 11 2017 Sunday 15
3 N1vE8iiEg64 2017-11-14 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12 07:08:48+00:00 Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 ... 4 False 14 11 2017 12 11 2017 Sunday 7
4 kJzGH0PVQHQ 2017-11-14 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13 01:14:16+00:00 Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 ... 5 False 14 11 2017 13 11 2017 Monday 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 2018-06-14 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13 08:01:11+00:00 twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 ... 2 False 14 6 2018 13 6 2018 Wednesday 8
37348 dpPmPbhcslM 2018-06-14 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13 11:30:04+00:00 flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 ... 2 False 14 6 2018 13 6 2018 Wednesday 11
37349 mV6aztP58f8 2018-06-14 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13 05:00:02+00:00 mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 ... 2 False 14 6 2018 13 6 2018 Wednesday 5
37350 qxqDNP1bDEw 2018-06-14 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13 15:07:49+00:00 tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 ... 2 False 14 6 2018 13 6 2018 Wednesday 15
37351 wERgpPK44w0 2018-06-14 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10 04:29:54+00:00 Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 ... 10 False 14 6 2018 10 6 2018 Sunday 4

36841 rows × 27 columns

Most popular publishing day of the videos

In [39]:
counted=df.groupby("publishing_day").size().reset_index(name="count")
fig = px.bar(counted, x="publishing_day", y="count", title="Most popular day to publish a trending video")
fig.show()

Friday can be seen as the most popular day to publish trending videos maybe because of the start of the weekend.

Most popular publishing hour

In [40]:
counted=df.groupby("publishing_hour").size().reset_index(name="count")
fig = px.bar(counted, x="publishing_hour", y="count", title="Most popular time to publish a trending video")
fig.show()

2pm seems to be the time where most videos are published. Both the publishing day and published hour seem to have not that much significance. All other days and hours have similar or normal distribution.

How long till the video becomes trending?

In [41]:
df['temp'] = np.where((df['trending_year'] != df['publish_year']) ,True, False) #Checks if the years are different
df.loc[df["temp"]==True]
Out[41]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... temp1 trending_day trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp
9059 0vV36TV0oms 2018-01-01 Kodakaa Koteswar Rao Full Song || Agnyaathavaa... Aditya Music 10 2017-12-31 12:36:38+00:00 Kodakaa Koteswar Rao|"Kodakaa Koteswar Rao Son... 3608261 255063 14272 ... False 1 1 2018 31 12 2017 Sunday 12 True
9060 5g7SFnNuDU8 2018-01-01 AIB : Honest House Parties | Part 2 All India Bakchod 23 2017-12-30 05:22:49+00:00 All India Bakchod|"AIB"|"AllIndiaBakchod"|"Tan... 1271287 29493 2080 ... False 1 1 2018 30 12 2017 Saturday 5 True
9061 Ku8PsnI9Mw0 2018-01-01 चिल्लर से कार खरीदने पंहुचा ये शख्स, फिर जो हु... HJ NEWS 25 2017-12-30 11:30:01+00:00 चिल्लर से खरीदी इस शख्स ने कार|"दुकानदारों की ... 1865515 6596 1721 ... False 1 1 2018 30 12 2017 Saturday 11 True
9062 SKU2lZRP71c 2018-01-01 पतला दुपट्टा तेरा मुँह दिखे | Patla Dupatta Te... Sonotek Haryanvi 10 2017-12-30 09:01:12+00:00 Sonotek Haryanvi|"पतला दुपट्टा तेरा मुँह दिखे"... 772035 3864 1205 ... False 1 1 2018 30 12 2017 Saturday 9 True
9063 ZRas5gKKIp0 2018-01-01 Irumbuthirai Official Teaser | Vishal, Arjun, ... Think Music India 1 2017-12-29 13:36:25+00:00 irumbuthirai|"irumbu thirai"|"vishal"|"samanth... 2189462 60540 3677 ... False 1 1 2018 29 12 2017 Friday 13 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9982 wGZ8MfccVaA 2018-01-05 SUPERHIT HD VIDEO SONG # खेसारी लाल यादव का Su... Khesari Music World 10 2017-12-31 00:30:00+00:00 bhojpuri hit songs|"bhojpuri hot song"|"bhojpu... 2520976 19193 3354 ... False 5 1 2018 31 12 2017 Sunday 0 True
10093 0vV36TV0oms 2018-01-06 Kodakaa Koteswar Rao Full Song || Agnyaathavaa... Aditya Music 10 2017-12-31 12:36:38+00:00 Kodakaa Koteswar Rao|"Kodakaa Koteswar Rao Son... 9377347 334328 25127 ... False 6 1 2018 31 12 2017 Sunday 12 True
16097 yXcHPvPT3II 2018-02-10 viva harsha Interview - new 2017 II Sneha Tali... Hamsa 4 U 1 2017-12-23 14:58:36+00:00 Hamsa Movies|"Hamsa Productions"|"Hamsa Kidzee... 117023 817 66 ... False 10 2 2018 23 12 2017 Saturday 14 True
16321 yXcHPvPT3II 2018-02-11 viva harsha Interview - new 2017 II Sneha Tali... Hamsa 4 U 1 2017-12-23 14:58:36+00:00 Hamsa Movies|"Hamsa Productions"|"Hamsa Kidzee... 210173 1354 117 ... False 11 2 2018 23 12 2017 Saturday 14 True
16566 yXcHPvPT3II 2018-02-12 viva harsha Interview - new 2017 II Sneha Tali... Hamsa 4 U 1 2017-12-23 14:58:36+00:00 Hamsa Movies|"Hamsa Productions"|"Hamsa Kidzee... 256408 1530 130 ... False 12 2 2018 23 12 2017 Saturday 14 True

397 rows × 28 columns

In [42]:
df["publish_year"].loc[df["temp"]==True].unique()
Out[42]:
array([2017])
In [43]:
df["publish_month"].loc[df["temp"]==True].unique() #Months where this 2017 year's videos are published on
Out[43]:
array([12,  5])
In [44]:
#Videos that were published in "December" of 2017 and Trended in 2018
temp1=df.loc[df["temp"]==True]
temp1=temp1.loc[temp1["publish_month"]==12]
temp1["trending_month"].unique()
Out[44]:
array([1, 2])
In [45]:
#Viral videos that remerge suddenly
temp2=df.loc[df["temp"]==True]
temp2.loc[temp2["publish_month"]==5]
Out[45]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... temp1 trending_day trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp
9377 T85l5uiBz1U 2018-01-02 Drunk and Drive Test to Anchor Pradeep, Ali & ... Film City 24 2017-05-27 20:27:30+00:00 Hyderabad traffic police|"drunk and drive"|"dr... 38068 46 39 ... False 2 1 2018 27 5 2017 Saturday 20 True
9618 T85l5uiBz1U 2018-01-03 Drunk and Drive Test to Anchor Pradeep, Ali & ... Film City 24 2017-05-27 20:27:30+00:00 Hyderabad traffic police|"drunk and drive"|"dr... 58175 57 66 ... True 3 1 2018 27 5 2017 Saturday 20 True

2 rows × 28 columns

This shows that the trending year and the year when the video was published are different. So some videos which have been published long time ago can resurface because of their virality. Most of the viral videos, nowadays, work this way. Like in the above table it can be seen that 400 videos were published in 2017 but only 2 of them are published in the month of "May" of 2017 and trended in "January" of 2018. Other videos were trending just a month or so later after publishing in "December" which is pretty normal.

In [46]:
df["publish_date"]=pd.to_datetime(df["publish_time"].dt.date) #to get the date to subtract the dates
df
Out[46]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_day trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date
0 kzwfHumJyYc 2017-11-14 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12 12:20:39+00:00 sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 ... 14 11 2017 12 11 2017 Sunday 12 False 2017-11-12
1 zUZ1z7FwLc8 2017-11-14 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13 05:43:56+00:00 पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 ... 14 11 2017 13 11 2017 Monday 5 False 2017-11-13
2 10L1hZ9qa58 2017-11-14 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12 15:48:08+00:00 Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 ... 14 11 2017 12 11 2017 Sunday 15 False 2017-11-12
3 N1vE8iiEg64 2017-11-14 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12 07:08:48+00:00 Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 ... 14 11 2017 12 11 2017 Sunday 7 False 2017-11-12
4 kJzGH0PVQHQ 2017-11-14 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13 01:14:16+00:00 Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 ... 14 11 2017 13 11 2017 Monday 1 False 2017-11-13
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 2018-06-14 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13 08:01:11+00:00 twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 ... 14 6 2018 13 6 2018 Wednesday 8 False 2018-06-13
37348 dpPmPbhcslM 2018-06-14 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13 11:30:04+00:00 flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 ... 14 6 2018 13 6 2018 Wednesday 11 False 2018-06-13
37349 mV6aztP58f8 2018-06-14 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13 05:00:02+00:00 mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 ... 14 6 2018 13 6 2018 Wednesday 5 False 2018-06-13
37350 qxqDNP1bDEw 2018-06-14 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13 15:07:49+00:00 tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 ... 14 6 2018 13 6 2018 Wednesday 15 False 2018-06-13
37351 wERgpPK44w0 2018-06-14 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10 04:29:54+00:00 Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 ... 14 6 2018 10 6 2018 Sunday 4 False 2018-06-10

36841 rows × 29 columns

In [47]:
df["days_before_trend"]=(df["trending_date"]-df["publish_date"]).dt.days #Created the column called "days_before_trend"
df.loc[:,"trending_date":"days_before_trend"]
Out[47]:
trending_date title channel_title category_id publish_time tags views likes dislikes comment_count ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
0 2017-11-14 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12 12:20:39+00:00 sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 882 ... 11 2017 12 11 2017 Sunday 12 False 2017-11-12 2
1 2017-11-14 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13 05:43:56+00:00 पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 0 ... 11 2017 13 11 2017 Monday 5 False 2017-11-13 1
2 2017-11-14 Stylish Star Allu Arjun @ ChaySam Wedding Rece... TFPC 24 2017-11-12 15:48:08+00:00 Stylish Star Allu Arjun @ ChaySam Wedding Rece... 473988 2011 243 149 ... 11 2017 12 11 2017 Sunday 15 False 2017-11-12 2
3 2017-11-14 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12 07:08:48+00:00 Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 2684 ... 11 2017 12 11 2017 Sunday 7 False 2017-11-12 2
4 2017-11-14 why Samantha became EMOTIONAL @ Samantha naga ... Filmylooks 24 2017-11-13 01:14:16+00:00 Filmylooks|"latest news"|"telugu movies"|"telu... 464015 492 293 66 ... 11 2017 13 11 2017 Monday 1 False 2017-11-13 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37347 2018-06-14 फेकू आशिक़ - राजस्थान की सबसे शानदार कॉमेडी | ... RDC Rajasthani 23 2018-06-13 08:01:11+00:00 twinkle vaishnav comedy|"twinkle vaishnav"|"tw... 214378 3291 404 196 ... 6 2018 13 6 2018 Wednesday 8 False 2018-06-13 1
37348 2018-06-14 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13 11:30:04+00:00 flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 1428 ... 6 2018 13 6 2018 Wednesday 11 False 2018-06-13 1
37349 2018-06-14 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13 05:00:02+00:00 mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 697 ... 6 2018 13 6 2018 Wednesday 5 False 2018-06-13 1
37350 2018-06-14 Nua Bohu | Full Ep 285 | 13th June 2018 | Odia... Tarang TV 24 2018-06-13 15:07:49+00:00 tarang|"tarang tv"|"tarang tv online"|"tarang ... 130263 698 115 65 ... 6 2018 13 6 2018 Wednesday 15 False 2018-06-13 1
37351 2018-06-14 Ee Nagaraniki Emaindi Trailer | Tharun Bhascke... Suresh Productions 24 2018-06-10 04:29:54+00:00 Ee Nagaraniki Emaindi|"Ee Nagaraniki Emaindi T... 1278249 22466 1609 1205 ... 6 2018 10 6 2018 Sunday 4 False 2018-06-10 4

36841 rows × 29 columns

In [48]:
fig = px.box(df, y="days_before_trend")
fig.show()
In [49]:
temp=df[['category_id','views', 'likes','dislikes','comment_count','trending_days','days_before_trend']]
sns.heatmap(temp.corr(),annot=True)
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e4b5f6f50>

Results: Eventhough it showed data about some outliers and how old videos can emerge as viral videos, it holds no significance or correlation with other data. So it can be removed when we are using data for prediction purposes.

4. Views, Likes, Dislikes, Comment count

Number of Views

In [50]:
df['views'].min() #Minimum view count
Out[50]:
4024
In [51]:
df['views'].max()#Maximum view count
Out[51]:
125432237
In [52]:
df.loc[df["views"]==125432237] #Video with most number of views
Out[52]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
5408 FlsCjmMhFmw 2017-12-12 YouTube Rewind: The Shape of 2017 | #YouTubeRe... YouTube Spotlight 24 2017-12-06 17:58:51+00:00 Rewind|"Rewind 2017"|"youtube rewind 2017"|"#Y... 125432237 2912710 1545017 ... 12 2017 6 12 2017 Wednesday 17 False 2017-12-06 6

1 rows × 30 columns

In [53]:
fig=px.strip(data_frame=df, x=df["views"]) #Views Distribution
fig.show()

Likes

In [54]:
df['likes'].min() #Minimum likes count
Out[54]:
0
In [55]:
df.loc[df["likes"]==0] #Videos with zero likes
Out[55]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
34 jElRtesCnlA 2017-11-14 Breaking News IT Raid - நடந்தது என்ன? சிக்கியத... Sathiyam News 25 2017-11-13 07:59:26+00:00 sathiyam news today|"sathiyam news live today"... 33533 0 0 ... 11 2017 13 11 2017 Monday 7 False 2017-11-13 1
36 qP67alYxSiU 2017-11-14 కెమెరాలో రికార్డ్ అయిన ఈ అద్భుతాన్ని చూస్తే ఆశ... HOTNEWS TELUGU 26 2017-11-12 03:53:49+00:00 Miracles Caught on Camera|"Miracles Caught on ... 128851 0 0 ... 11 2017 12 11 2017 Sunday 3 False 2017-11-12 2
43 q0-SCcsuSSs 2017-11-14 Akkineni Nagarjuna Response On Annapurna Studi... I News 25 2017-11-13 15:30:47+00:00 I news|"i news telugu channel"|"i news youtube... 23932 0 0 ... 11 2017 13 11 2017 Monday 15 False 2017-11-13 1
70 7BUEKSFwk7Q 2017-11-14 Mission IAS 2018 : 13 November, 2017 The Hindu... only ias 27 2017-11-13 03:13:24+00:00 [none] 23114 0 0 ... 11 2017 13 11 2017 Monday 3 False 2017-11-13 1
128 fRY7UWGFvdc 2017-11-14 ഇടുക്കി ആശുപത്രിയിൽ രോഗികളെ ക്യൂവിൽ നിർത്തിയ ജ... Malayalam News Time 22 2017-11-12 11:04:59+00:00 [none] 197122 0 0 ... 11 2017 12 11 2017 Sunday 11 False 2017-11-12 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36833 QEiskdHs8I8 2018-06-11 Gangster Sampat Nehra arrested by Haryana poli... TV9 Telugu 25 2018-06-10 08:51:52+00:00 tv9|"tv9 live"|"tv9 news"|"live tv9"|"tv9 telu... 89609 0 0 ... 6 2018 10 6 2018 Sunday 8 False 2018-06-10 1
36950 ihOPILF66Z8 2018-06-12 I AM 30 | Sindhu Loknath | V Vikas | Vikram Yo... SAKKATH STUDIO 24 2018-06-11 02:30:02+00:00 loose connection|"loose connection kannada"|"l... 39697 0 0 ... 6 2018 11 6 2018 Monday 2 False 2018-06-11 1
37038 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37158 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37246 zVtDaUn6iYg 2018-06-14 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 195551 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 2

762 rows × 30 columns

In [56]:
df['likes'].max() #Maximum likes count
Out[56]:
2912710
In [57]:
df.loc[df["likes"]==2912710]# Video with max no. of likes
Out[57]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
5408 FlsCjmMhFmw 2017-12-12 YouTube Rewind: The Shape of 2017 | #YouTubeRe... YouTube Spotlight 24 2017-12-06 17:58:51+00:00 Rewind|"Rewind 2017"|"youtube rewind 2017"|"#Y... 125432237 2912710 1545017 ... 12 2017 6 12 2017 Wednesday 17 False 2017-12-06 6

1 rows × 30 columns

In [58]:
fig=px.strip(data_frame=df, x=df["likes"]) #Likes Distribution
fig.show()

Dislikes

In [59]:
df['dislikes'].min() #Minimum dislikes count
Out[59]:
0
In [60]:
df.loc[df["dislikes"]==0] #Videos with zero dislikes
Out[60]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
34 jElRtesCnlA 2017-11-14 Breaking News IT Raid - நடந்தது என்ன? சிக்கியத... Sathiyam News 25 2017-11-13 07:59:26+00:00 sathiyam news today|"sathiyam news live today"... 33533 0 0 ... 11 2017 13 11 2017 Monday 7 False 2017-11-13 1
36 qP67alYxSiU 2017-11-14 కెమెరాలో రికార్డ్ అయిన ఈ అద్భుతాన్ని చూస్తే ఆశ... HOTNEWS TELUGU 26 2017-11-12 03:53:49+00:00 Miracles Caught on Camera|"Miracles Caught on ... 128851 0 0 ... 11 2017 12 11 2017 Sunday 3 False 2017-11-12 2
43 q0-SCcsuSSs 2017-11-14 Akkineni Nagarjuna Response On Annapurna Studi... I News 25 2017-11-13 15:30:47+00:00 I news|"i news telugu channel"|"i news youtube... 23932 0 0 ... 11 2017 13 11 2017 Monday 15 False 2017-11-13 1
70 7BUEKSFwk7Q 2017-11-14 Mission IAS 2018 : 13 November, 2017 The Hindu... only ias 27 2017-11-13 03:13:24+00:00 [none] 23114 0 0 ... 11 2017 13 11 2017 Monday 3 False 2017-11-13 1
92 3BF-iAMjciE 2017-11-14 मृत अनिकेत कोथळेंच्या मुलीचा आर्त सवाल ZEE 24 TAAS 25 2017-11-12 08:12:19+00:00 zee24taas|"marathi video"|"marathi news"|"Brea... 37616 17 0 ... 11 2017 12 11 2017 Sunday 8 False 2017-11-12 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36833 QEiskdHs8I8 2018-06-11 Gangster Sampat Nehra arrested by Haryana poli... TV9 Telugu 25 2018-06-10 08:51:52+00:00 tv9|"tv9 live"|"tv9 news"|"live tv9"|"tv9 telu... 89609 0 0 ... 6 2018 10 6 2018 Sunday 8 False 2018-06-10 1
36950 ihOPILF66Z8 2018-06-12 I AM 30 | Sindhu Loknath | V Vikas | Vikram Yo... SAKKATH STUDIO 24 2018-06-11 02:30:02+00:00 loose connection|"loose connection kannada"|"l... 39697 0 0 ... 6 2018 11 6 2018 Monday 2 False 2018-06-11 1
37038 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37158 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37246 zVtDaUn6iYg 2018-06-14 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 195551 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 2

771 rows × 30 columns

It is safe to say that just because dislikes are zero it doesnot mean the same video should have zero likes. There are about 9 extra rows in such way.

In [61]:
df['dislikes'].max() #Maximum dislikes count
Out[61]:
1545017
In [62]:
df.loc[df["dislikes"]==1545017]# Video with max no. of dislikes
Out[62]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
5408 FlsCjmMhFmw 2017-12-12 YouTube Rewind: The Shape of 2017 | #YouTubeRe... YouTube Spotlight 24 2017-12-06 17:58:51+00:00 Rewind|"Rewind 2017"|"youtube rewind 2017"|"#Y... 125432237 2912710 1545017 ... 12 2017 6 12 2017 Wednesday 17 False 2017-12-06 6

1 rows × 30 columns

In [63]:
fig=px.strip(data_frame=df, x=df["dislikes"]) #Dislikes Distribution
fig.show()

Comment_count

In [64]:
df['comment_count'].min() #Minimum comments count
Out[64]:
0
In [65]:
df.loc[df["comment_count"]==0] #Videos with zero comments
Out[65]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
1 zUZ1z7FwLc8 2017-11-14 पीरियड्स के समय, पेट पर पति करता ऐसा, देखकर दं... HJ NEWS 25 2017-11-13 05:43:56+00:00 पीरियड्स के समय|"पेट पर पति करता ऐसा"|"देखकर द... 590101 735 904 ... 11 2017 13 11 2017 Monday 5 False 2017-11-13 1
12 Prb_osSVE0M 2017-11-14 Renu Desai Gives Rating For Pawan kalyan As Hu... ABN Telugu 25 2017-11-13 09:51:59+00:00 Resnu desai|"actress renu desai"|"pawan kalyan... 156085 716 53 ... 11 2017 13 11 2017 Monday 9 False 2017-11-13 1
36 qP67alYxSiU 2017-11-14 కెమెరాలో రికార్డ్ అయిన ఈ అద్భుతాన్ని చూస్తే ఆశ... HOTNEWS TELUGU 26 2017-11-12 03:53:49+00:00 Miracles Caught on Camera|"Miracles Caught on ... 128851 0 0 ... 11 2017 12 11 2017 Sunday 3 False 2017-11-12 2
70 7BUEKSFwk7Q 2017-11-14 Mission IAS 2018 : 13 November, 2017 The Hindu... only ias 27 2017-11-13 03:13:24+00:00 [none] 23114 0 0 ... 11 2017 13 11 2017 Monday 3 False 2017-11-13 1
82 XU6iH5LHwDw 2017-11-14 హిజ్రాల నుంచి ఈ వస్తువు తీసుకుంటే మీ జీవితం మా... Mana Telugu 22 2017-11-12 04:30:00+00:00 వారి|"దగ్గరి"|"నుండి"|"ఒక్క"|"వస్తువు"|"తీసుకో... 65054 349 28 ... 11 2017 12 11 2017 Sunday 4 False 2017-11-12 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36213 jbDufua0fFo 2018-06-08 Things SIKH people are tired of hearing - Neve... Old Delhi Films 24 2018-06-07 05:03:55+00:00 sikhs|"indian sikhs"|"sardar ji"|"sardar"|"sar... 58774 0 0 ... 6 2018 7 6 2018 Thursday 5 False 2018-06-07 1
36380 69v22QLmyAA 2018-06-09 Vidya Vinayaka - ವಿದ್ಯಾ ವಿನಾಯಕ | Episode - 160... Zee Kannada 24 2018-06-08 02:00:29+00:00 vidya vinayaka serial yesterday episode|"vidya... 19582 30 3 ... 6 2018 8 6 2018 Friday 2 False 2018-06-08 1
36598 pq-eP_RCclI 2018-06-10 ఏసీల్లో కూర్చొని ఎవరైనా మాట్లాడుతారు.. Sakshi TV 25 2018-06-10 07:10:32+00:00 hero vishal|"actor vishal"|"abhimanyudu movie"... 95939 0 0 ... 6 2018 10 6 2018 Sunday 7 False 2018-06-10 0
36729 79GMkkOzP3Q 2018-06-11 മഞ്ജുവിനെ ആശ്വസിപ്പിക്കാൻ ദിലീപ് മീനാക്ഷിക്കൊപ... Entertainment Journalist 24 2018-06-10 18:21:38+00:00 [none] 55483 51 19 ... 6 2018 10 6 2018 Sunday 18 False 2018-06-10 1
36771 pq-eP_RCclI 2018-06-11 ఏసీల్లో కూర్చొని ఎవరైనా మాట్లాడుతారు.. Sakshi TV 25 2018-06-10 07:10:32+00:00 hero vishal|"actor vishal"|"abhimanyudu movie"... 117357 0 0 ... 6 2018 10 6 2018 Sunday 7 False 2018-06-10 1

1302 rows × 30 columns

In [66]:
df['comment_count'].max() #Maximum comments count
Out[66]:
827755
In [67]:
df.loc[df["comment_count"]==827755]# Video with max no. of comments
Out[67]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
4743 FlsCjmMhFmw 2017-12-09 YouTube Rewind: The Shape of 2017 | #YouTubeRe... YouTube Spotlight 24 2017-12-06 17:58:51+00:00 Rewind|"Rewind 2017"|"youtube rewind 2017"|"#Y... 75969469 2251815 1127805 ... 12 2017 6 12 2017 Wednesday 17 False 2017-12-06 3

1 rows × 30 columns

In [68]:
fig=px.strip(data_frame=df, x=df["comment_count"]) #comments count distribution
fig.show()
In [69]:
df.loc[df["comment_count"]>df["likes"]] #videos with more comments than likes
Out[69]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
34 jElRtesCnlA 2017-11-14 Breaking News IT Raid - நடந்தது என்ன? சிக்கியத... Sathiyam News 25 2017-11-13 07:59:26+00:00 sathiyam news today|"sathiyam news live today"... 33533 0 0 ... 11 2017 13 11 2017 Monday 7 False 2017-11-13 1
43 q0-SCcsuSSs 2017-11-14 Akkineni Nagarjuna Response On Annapurna Studi... I News 25 2017-11-13 15:30:47+00:00 I news|"i news telugu channel"|"i news youtube... 23932 0 0 ... 11 2017 13 11 2017 Monday 15 False 2017-11-13 1
128 fRY7UWGFvdc 2017-11-14 ഇടുക്കി ആശുപത്രിയിൽ രോഗികളെ ക്യൂവിൽ നിർത്തിയ ജ... Malayalam News Time 22 2017-11-12 11:04:59+00:00 [none] 197122 0 0 ... 11 2017 12 11 2017 Sunday 11 False 2017-11-12 2
149 t_vpgs7aDco 2017-11-14 Real Fight Between Hero Rajendra Prasad and Vi... Tollywood Mirapakai 24 2017-11-13 04:45:33+00:00 Real Fight Between Hero Rajendra Prasad and Vi... 19567 171 28 ... 11 2017 13 11 2017 Monday 4 False 2017-11-13 1
158 ifqJNMRupEU 2017-11-14 பாகுபலி பிரபாசை போன்று யானையிடம் செய்ய முயன்றவ... News VIdeo 22 2017-11-13 12:10:32+00:00 [none] 7573 1 1 ... 11 2017 13 11 2017 Monday 12 False 2017-11-13 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37038 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37155 smcUETnz2xY 2018-06-13 Sri Reddy Reacts On Nani's Legal Notices | Fil... Film Jalsa 24 2018-06-11 14:44:03+00:00 Film Jalsa|"Sri Reddy Reacts On Nani's Legal N... 180791 610 2783 ... 6 2018 11 6 2018 Monday 14 False 2018-06-11 2
37158 zVtDaUn6iYg 2018-06-13 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 154412 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 1
37245 smcUETnz2xY 2018-06-14 Sri Reddy Reacts On Nani's Legal Notices | Fil... Film Jalsa 24 2018-06-11 14:44:03+00:00 Film Jalsa|"Sri Reddy Reacts On Nani's Legal N... 313505 917 3977 ... 6 2018 11 6 2018 Monday 14 False 2018-06-11 3
37246 zVtDaUn6iYg 2018-06-14 நான் ஆமைக்கறி சாப்பிட்டுருக்கிறேன் - மனம் திறக... Radio Mirchi Tamil 24 2018-06-12 05:35:35+00:00 Kaala|"seeman"|"duraimurugan"|"ops"|"paranjith... 195551 0 0 ... 6 2018 12 6 2018 Tuesday 5 False 2018-06-12 2

641 rows × 30 columns

In [70]:
df.loc[df["comment_count"]>df["dislikes"]] #videos with more comments than dislikes
Out[70]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes ... trending_month trending_year publish_day publish_month publish_year publishing_day publishing_hour temp publish_date days_before_trend
0 kzwfHumJyYc 2017-11-14 Sharry Mann: Cute Munda ( Song Teaser) | Parmi... Lokdhun Punjabi 1 2017-11-12 12:20:39+00:00 sharry mann|"sharry mann new song"|"sharry man... 1096327 33966 798 ... 11 2017 12 11 2017 Sunday 12 False 2017-11-12 2
3 N1vE8iiEg64 2017-11-14 Eruma Saani | Tamil vs English Eruma Saani 23 2017-11-12 07:08:48+00:00 Eruma Saani|"Tamil Comedy Videos"|"Films"|"Mov... 1242680 70353 1624 ... 11 2017 12 11 2017 Sunday 7 False 2017-11-12 2
5 il_pSa5l98w 2017-11-14 MCA (Middle Class Abbayi) TEASER - Nani,Sai Pa... Dil Raju 24 2017-11-10 04:29:50+00:00 Nenu Local|"Nenu Local Telugu Movie"|"Nani"|"S... 6106669 98612 4185 ... 11 2017 10 11 2017 Friday 4 False 2017-11-10 4
6 7MxiQ4v0EnE 2017-11-14 Daang ( Full Video ) | Mankirt Aulakh | Sukh S... Speed Records 10 2017-11-11 16:41:15+00:00 punjabi songs|"punjabi bhangra"|"punjabi music... 5718766 127477 7134 ... 11 2017 11 11 2017 Saturday 16 False 2017-11-11 3
7 c64I9HNpiOY 2017-11-14 Padmavati : Ek Dil Ek Jaan Video Song | Deepik... T-Series 10 2017-11-11 06:14:19+00:00 Ek Dil Ek Jaan Video Song|"'Ek Dil Ek Jaan'"|"... 10588371 132738 8812 ... 11 2017 11 11 2017 Saturday 6 False 2017-11-11 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
37343 zIvfX9vc0Nw 2018-06-14 THINGS GUJARATIS ARE TIRED OF HEARING FT. Tath... Hasley India 23 2018-06-11 13:17:24+00:00 hasley|"india"|"harsh"|"beniwal"|"rishhsome"|"... 447755 23896 1304 ... 6 2018 11 6 2018 Monday 13 False 2018-06-11 3
37344 5gd1G0aeqtw 2018-06-14 If everything on the internet was true | Comed... Dreamz Unlimited 24 2018-06-13 15:33:32+00:00 nagaland|"northeast"|"facebook"|"whatsapp"|"so... 56313 6687 57 ... 6 2018 13 6 2018 Wednesday 15 False 2018-06-13 1
37346 8aMckaI9k00 2018-06-14 Indore Preview- Chacha Vidhayak Hai Humare - Z... Zakir Khan 22 2018-06-13 14:18:31+00:00 [none] 274861 16448 183 ... 6 2018 13 6 2018 Wednesday 14 False 2018-06-13 1
37348 dpPmPbhcslM 2018-06-14 Seetha | Flowers | Ep# 364 Flowers TV 24 2018-06-13 11:30:04+00:00 flowers serials|"actress"|"malayalam serials"|... 406828 1726 478 ... 6 2018 13 6 2018 Wednesday 11 False 2018-06-13 1
37349 mV6aztP58f8 2018-06-14 Bhramanam I Episode 87 - 12 June 2018 I Mazhav... Mazhavil Manorama 24 2018-06-13 05:00:02+00:00 mazhavil manorama|"bhramanam full episode"|"gt... 386319 1216 453 ... 6 2018 13 6 2018 Wednesday 5 False 2018-06-13 1

17536 rows × 30 columns

Results:

  1. All the views,likes,dislikes and the comments count is maximum only in the case of one video from the channel "Youtube Spotlight." The reason for it is due to the fact that it is an extremum and an outlier in every situation.
  2. The views can never be zero if it is on the trending list. And it is not necessary that all videos that have zero dislikes have zero likes but the opposite is more than possible.
  3. Comments count have more number of records that have zero than both likes and dislikes columns.

Unsupervised Learning

Theory behind the Unsupervise learning algorithms

Unsupervised learning algorithms are used when there is no target variable. Here we want to classify our youtube videos based on the closeness of the data. And the following algorithms use distance as a measure to calculate the clusters.

Kmeans Clustering The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. Hence the no. of clusters is calculated using the elbow method. The below WCSS graph is calculated for each value of k, i.e from 1 to 11 and optimised k is taken where there is an elbow point. In our case, the optimal number of clusters are 5 and so n=5.

Hierarchical Agglomerative Clustering Hierarchical clustering is a general family of clustering algorithms that build nested clusters by merging or splitting them successively. This hierarchy of clusters is represented as a tree (or dendrogram). The root of the tree is the unique cluster that gathers all the samples, the leaves being the clusters with only one sample. Ward minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.

Metrics

  1. Silhouette Coefficient: It attempts to describe how similar a datapoint is to other datapoints in its cluster, relative to datapoints not in its cluster (this is aggregated over all datapoints to get the score for an overall clustering). In other words, it thinks about how ‘distinct’ the clusters are in space — indeed one could use any measure of ‘distance’ to calculate the score. It is bounded between -1 and 1. Closer to -1 suggests incorrect clustering, while closer to +1 shows that each cluster is very dense.
  2. Calinski and Harabasz score: The Calinski Harabaz Index is the ratio of the variance of a datapoint compared to points in other clusters, against the variance compared to points within its cluster. Since we want this first part to be high, and the second part to be low, a high CH index is desirable. Unlike other metrics we have seen, this score is not bounded.

The above info is obtained from:

https://towardsdatascience.com/how-to-evaluate-unsupervised-learning-models-3aa85bd98aa2

https://scikit-learn.org/stable/modules/clustering.html#

K means Clustering

In [71]:
from sklearn import metrics
In [112]:
ndf=df[["video_id","likes","dislikes","views","comment_count","trending_days"]]
ndf.shape
Out[112]:
(36841, 6)
In [113]:
x=ndf.values
x=x[:,1:]
x
Out[113]:
array([[33966, 798, 1096327, 882, 3],
       [735, 904, 590101, 0, 6],
       [2011, 243, 473988, 149, 4],
       ...,
       [1216, 453, 386319, 697, 2],
       [698, 115, 130263, 65, 2],
       [22466, 1609, 1278249, 1205, 10]], dtype=object)
In [114]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
In [115]:
Standardisation = StandardScaler() 
  
# Scaled feature 
nx = Standardisation.fit_transform(x) 
nx
Out[115]:
array([[ 0.07168756, -0.05368428,  0.01318341, -0.12005327, -0.2265826 ],
       [-0.26945984, -0.04713375, -0.14601198, -0.17902811,  1.03335846],
       [-0.25636051, -0.08798189, -0.18252661, -0.16906524,  0.19339776],
       ...,
       [-0.26452193, -0.07500442, -0.21009631, -0.13242328, -0.64656295],
       [-0.26983968, -0.09589197, -0.2906195 , -0.17468189, -0.64656295],
       [-0.04637074, -0.00356651,  0.07039332, -0.09845591,  2.71327988]])
In [76]:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
wcss=[]

for i in range(1,11):
    kmeans=KMeans(n_clusters=i,init='k-means++',random_state=0)
    kmeans.fit(nx)
    wcss.append(kmeans.inertia_)#in the kmeans package wcss is called as inertia_
plt.plot(range(1,11),wcss)
plt.title("The elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()
In [116]:
pipe= KMeans(n_clusters=5,init='k-means++',random_state=0)
ymeans=pipe.fit_predict(nx)
In [78]:
metrics.silhouette_score(nx, ymeans)
Out[78]:
0.5656697145812638
In [79]:
metrics.calinski_harabasz_score(nx, ymeans)
Out[79]:
28542.987137143806
In [80]:
ymeans
Out[80]:
array([0, 1, 0, ..., 0, 0, 1], dtype=int32)
In [117]:
ndf['Cluster1'] = ymeans
ndf
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[117]:
video_id likes dislikes views comment_count trending_days Cluster1
0 kzwfHumJyYc 33966 798 1096327 882 3 0
1 zUZ1z7FwLc8 735 904 590101 0 6 1
2 10L1hZ9qa58 2011 243 473988 149 4 0
3 N1vE8iiEg64 70353 1624 1242680 2684 4 0
4 kJzGH0PVQHQ 492 293 464015 66 5 1
... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 3291 404 214378 196 2 0
37348 dpPmPbhcslM 1726 478 406828 1428 2 0
37349 mV6aztP58f8 1216 453 386319 697 2 0
37350 qxqDNP1bDEw 698 115 130263 65 2 0
37351 wERgpPK44w0 22466 1609 1278249 1205 10 1

36841 rows × 7 columns

In [82]:
arr1=["Cluster 1","Cluster 2","Cluster 5","Cluster 3","Cluster 4"]
arr=list(set(ymeans))
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(ndf.loc[ndf["Cluster1"] == arr[i]].shape[0]) 
fig = px.pie(ndf, values=sum_of_videos, names=arr1,title="Percent of each cluster")
fig.show()
In [83]:
print(sum_of_videos)
[26428, 9293, 5, 1054, 61]
In [84]:
fig = px.scatter(ndf, x="views", y="likes", color="Cluster1",
                 size='comment_count', hover_data=['dislikes','trending_days'])
fig.show()
In [85]:
temp=ndf["video_id"].loc[ndf["Cluster1"]==2] #All items in Cluster 2 comes from one video
temp
Out[85]:
4568    FlsCjmMhFmw
4743    FlsCjmMhFmw
4936    FlsCjmMhFmw
5119    FlsCjmMhFmw
5408    FlsCjmMhFmw
Name: video_id, dtype: object
In [86]:
sns.heatmap(ndf.corr(),annot=True)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e38331ad0>

Hierarchical Agglomerative Clustering

In [120]:
from sklearn.cluster import AgglomerativeClustering
In [121]:
pipe= AgglomerativeClustering(n_clusters=5, linkage='ward') #default= ward linkage
ymeans=pipe.fit_predict(nx)
In [122]:
ndf['Cluster2'] = ymeans
ndf
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[122]:
video_id likes dislikes views comment_count trending_days Cluster1 Cluster2
0 kzwfHumJyYc 33966 798 1096327 882 3 0 0
1 zUZ1z7FwLc8 735 904 590101 0 6 1 0
2 10L1hZ9qa58 2011 243 473988 149 4 0 0
3 N1vE8iiEg64 70353 1624 1242680 2684 4 0 0
4 kJzGH0PVQHQ 492 293 464015 66 5 1 0
... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 3291 404 214378 196 2 0 4
37348 dpPmPbhcslM 1726 478 406828 1428 2 0 4
37349 mV6aztP58f8 1216 453 386319 697 2 0 4
37350 qxqDNP1bDEw 698 115 130263 65 2 0 4
37351 wERgpPK44w0 22466 1609 1278249 1205 10 1 0

36841 rows × 8 columns

In [107]:
arr1=["Cluster 1","Cluster 3","Cluster 4","Cluster 5","Cluster 2"]
arr=list(set(ymeans))
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(ndf.loc[ndf["Cluster2"] == arr[i]].shape[0]) 
fig = px.pie(ndf, values=sum_of_videos, names=arr1,title="Percent of each cluster")
fig.show()
In [124]:
print(sum_of_videos)
[19733, 1464, 103, 5, 15536]
In [109]:
fig = px.scatter(ndf, x="views", y="likes", color="Cluster2",
                 size='comment_count', hover_data=['dislikes','trending_days'])
fig.show()
In [123]:
sns.heatmap(ndf.corr(),annot=True)
Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3e29ef1190>
In [94]:
metrics.silhouette_score(nx, ymeans)
Out[94]:
0.4236194209697896
In [95]:
metrics.calinski_harabasz_score(nx, ymeans)
Out[95]:
23368.66888399611
In [96]:
pipe= AgglomerativeClustering(n_clusters=5, linkage='complete') #complete linkage
ymeans=pipe.fit_predict(nx)
In [97]:
ndf['Cluster3'] = ymeans
ndf
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[97]:
video_id likes dislikes views comment_count trending_days Cluster1 Cluster2 Cluster3
0 kzwfHumJyYc 33966 798 1096327 882 3 0 0 0
1 zUZ1z7FwLc8 735 904 590101 0 6 1 0 0
2 10L1hZ9qa58 2011 243 473988 149 4 0 0 0
3 N1vE8iiEg64 70353 1624 1242680 2684 4 0 0 0
4 kJzGH0PVQHQ 492 293 464015 66 5 1 0 0
... ... ... ... ... ... ... ... ... ...
37347 iNHecA3PJCo 3291 404 214378 196 2 0 4 0
37348 dpPmPbhcslM 1726 478 406828 1428 2 0 4 0
37349 mV6aztP58f8 1216 453 386319 697 2 0 4 0
37350 qxqDNP1bDEw 698 115 130263 65 2 0 4 0
37351 wERgpPK44w0 22466 1609 1278249 1205 10 1 0 0

36841 rows × 9 columns

In [98]:
arr=list(set(ymeans))
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(ndf.loc[ndf["Cluster3"] == arr[i]].shape[0]) 
print(sum_of_videos)
[36828, 2, 3, 7, 1]
In [99]:
metrics.silhouette_score(nx, ymeans)
Out[99]:
0.9609986114073137
In [100]:
metrics.calinski_harabasz_score(nx, ymeans)
Out[100]:
5204.754485760965
In [101]:
pipe= AgglomerativeClustering(n_clusters=5, linkage='average') #average linkage
ymeans=pipe.fit_predict(nx)
In [102]:
ndf['Cluster4'] = ymeans
arr=list(set(ymeans))
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(ndf.loc[ndf["Cluster4"] == arr[i]].shape[0]) 
print(sum_of_videos)
[36795, 40, 1, 3, 2]
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [103]:
metrics.silhouette_score(nx, ymeans)
Out[103]:
0.9367040684201027
In [104]:
metrics.calinski_harabasz_score(nx, ymeans)
Out[104]:
6934.231040010465
In [105]:
pipe= AgglomerativeClustering(n_clusters=5, linkage='single') #single linkage
ymeans=pipe.fit_predict(nx)
ndf['Cluster5'] = ymeans
arr=list(set(ymeans))
sum_of_videos=[]
for i in range(len(arr)):
    sum_of_videos.append(ndf.loc[ndf["Cluster5"] == arr[i]].shape[0]) 
print(sum_of_videos)
metrics.silhouette_score(nx, ymeans)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

[3, 1, 36835, 1, 1]
Out[105]:
0.9645890646793039
In [106]:
metrics.calinski_harabasz_score(nx, ymeans)
Out[106]:
3908.250072174032

Conclusion

  1. All the linkages in hierarchical clustering were observed and ward linkage proved to give a best distribution of the data. Although for 'average' linkage, silhouette cofficient gave about 96%. It was seen that clusters were not distributed properly and hence the calinski harabasz score was just low. This was the same case with all other linkages and hence, 'ward' linkage was chosen as it seemed closer to the real world.
  2. The metrics were compared and it was seen that 'kmeans' algorithm was slightly better than the 'ward' linkage hierarchical clustering.